/*
 * Copyright (C) 2011-2021 Intel Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/// @example memory_format_propagation.cpp
/// This example demonstrates memory format propagation, which is critical for
/// deep learning applications performance.
///
/// > Annotated version: @ref memory_format_propagation_cpp

#include <iostream>
#include <sstream>
#include <string>

/// @page memory_format_propagation_cpp Memory format propagation
/// > Example code: @ref memory_format_propagation.cpp
///
/// Format propagation is one of the central notions that needs to be
/// well-understood to use DNNL correctly.
///
/// Convolution and inner product primitives choose the memory format when you
/// create them with the placeholder memory format
/// #dnnl::memory::format_tag::any for input or output. The memory format
/// chosen is based on different circumstances such as hardware and
/// convolutional parameters. Using the placeholder memory format is the
/// recommended practice for convolutions, since they are the most
/// compute-intensive operations in most topologies where they are present.
///
/// Other primitives, such as Elementwise, LRN, batch normalization and other,
/// on forward propagation should use the same memory format as the preceding
/// layer thus propagating the memory format through multiple DNNL primitives.
/// This avoids unnecessary reorders which may be expensive and should be
/// avoided unless a compute-intensive primitive requires a different format.
/// For performance reasons, backward computations of such primitives requires
/// consistent memory format with the corresponding forward computations.
/// Hence, when initializing there primitives for backward computations you
/// should use #dnnl::memory::format_tag::any memory format tag as well.
///
/// Below is the short summary when to use and not to use memory format
/// #dnnl::memory::format_tag::any during operation description initialization:
///
/// | Primitive Kinds                                                                                                               | Forward Propagation                                                                               | Backward Propagation                                                                                | No Propagation                                                                                    |
/// | :--                                                                                                                           | :--                                                                                               | :--                                                                                                 | :--                                                                                               |
/// | Compute intensive: (De-)convolution, Inner product, RNN                                                                       | Use #dnnl::memory::format_tag::any                                                                | Use #dnnl::memory::format_tag::any                                                                  | N/A                                                                                               |
/// | Memory-bandwidth limited: Pooling, Layer and Batch Normalization, Local Response Normalization, Elementwise, Shuffle, Softmax | Use memory format from preceding layer for inputs, and #dnnl::memory::format_tag::any for outputs | Use #dnnl::memory::format_tag::any for gradient tensors, and actual memory formats for data tensors | N/A                                                                                               |
/// | Memory-bandwidth limited: Reorder, Concat, Sum, Binary                                                                        | N/A                                                                                               | N/A                                                                                                 | Use memory format from preceding layer for inputs, and #dnnl::memory::format_tag::any for outputs |
///
/// Additional format synchronization is required between forward and backward
/// computations when running training workloads. This topic is covered in
/// [Training-Specific Aspects](@ref dev_guide_inference_and_training_aspects_training).
///
/// For better understanding of the architecture and design of DNNL
/// as well as the concepts used in the library, please refer to @ref
/// dev_guide_understanding_memory_formats.
///
/// @section memory_format_propagation_intro Introduction to the tutorial
///
/// This C++ API example demonstrates how to use optimized memory formats
/// supported by DNNL:
/// - How to configure primitives to use optimized memory formats.
/// - How to determine whether data needs to be reordered from/to optimized
///   memory formats.
///
/// This tutorial assumes that the reader has already reviewed the
/// @ref getting_started_cpp tutorial.
///
/// The example is built around a CNN consisting of a convolution followed by
/// a pooling and consists of the following steps:
/// 1. Create a pooling primitive descriptor based on the memory format chosen
///    by the convolution primitive.
/// 2. Create memory descriptors for input and output data in the NCHW memory
///    format.
/// 3. Determine if input and output data needs to be reordered from/to the
///    optimized memory format.
/// 4. Create memory objects; and necessary primitives and execute them.
///
/// These steps are implemented in the @ref memory_format_propagation_tutorial
/// which in turn is called from `main()` which is also responsible for error
/// handling.

#include "dnnl.hpp"
#include "dnnl_debug.h"

#include "example_utils.hpp"

using namespace dnnl;

/// @page memory_format_propagation_cpp
/// @section memory_format_propagation_tutorial memory_format_propagation() function
/// @page memory_format_propagation_cpp
static void memory_format_propagation_tutorial(engine::kind engine_kind) {
    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub1 Initialization
    ///
    /// We start by creating an engine and a stream that we will use when
    /// creating primitive descriptors and executing primitives.
    ///
    /// @snippet memory_format_propagation.cpp Initialize engine and stream
    // [Initialize engine and stream]
    engine eng(engine_kind, 0);
    stream s(eng);
    // [Initialize engine and stream]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub2 Create convolution and pooling primitives
    ///
    /// To specify that a primitive should pick an optimized format for the
    /// specified computation parameters, we create memory descriptors with
    /// memory format set to @ref dnnl::memory::format_tag::any.
    ///
    /// This approach works only for a limited set of primitives: convolutions
    /// and inner products. Additionally, @ref dnnl::memory::format_tag::any
    /// can be specified for destination memory descriptors which implies that
    /// destination will have the same memory format as the source.
    ///
    /// @snippet memory_format_propagation.cpp Create placeholder memory descriptors
    // [Create placeholder memory descriptors]
    // Tensor and kernel dimensions. We use the same 3x3 kernel with padding=1
    // for both convolution and pooling primitives, which means that the
    // activation tensor shapes do not change.
    const int N = 1, H = 14, W = 14, IC = 256, OC = IC, KH = 3, KW = 3;
    auto conv_src_md = memory::desc({N, IC, H, W}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    auto conv_weights_md = memory::desc(
            {IC, OC, KH, KW}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    auto conv_dst_md = conv_src_md; // shape does not change
    auto pool_dst_md = conv_dst_md; // shape does not change
    // [Create placeholder memory descriptors]

    /// @page memory_format_propagation_cpp
    ///
    /// Next, we pass the memory descriptors to primitive descriptors
    /// constructors.
    ///
    // @snippet memory_format_propagation.cpp Create convolution and pooling primitive descriptors
    // [Create convolution and pooling primitive descriptors]
    auto conv_pd = convolution_forward::primitive_desc(
            {prop_kind::forward_inference, algorithm::convolution_auto,
                    conv_src_md, conv_weights_md,
                    conv_dst_md, // shape information
                    {1, 1}, // strides
                    {1, 1}, {1, 1}}, // left and right padding
            eng);
    auto pool_pd = pooling_forward::primitive_desc(
            {prop_kind::forward_inference, algorithm::pooling_max,
                    conv_pd.dst_desc(), pool_dst_md, // shape information
                    {1, 1}, {KH, KW}, // strides and kernel
                    {1, 1}, {1, 1}}, // left and right padding
            eng);
    // [Create convolution and pooling primitive descriptors]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub3 Create source and destination memory objects
    ///
    /// We assume that the 'user' source and destination memory format is
    /// NCHW. Since there is no result validation in this tutorial, we do not
    /// bother with filling the data with some values and let DNNL
    /// allocate the memory.
    ///
    /// @snippet memory_format_propagation.cpp Create source and destination memory objects
    // [Create source and destination memory objects]
    auto src_mem = memory(
            {{N, IC, H, W}, memory::data_type::f32, memory::format_tag::nchw},
            eng);
    auto weights_mem = memory({{IC, OC, KH, KW}, memory::data_type::f32,
                                      memory::format_tag::oihw},
            eng);
    auto dst_mem = memory(
            {{N, IC, H, W}, memory::data_type::f32, memory::format_tag::nchw},
            eng);
    // [Create source and destination memory objects]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub4 Determine if source and destination need to be reordered
    ///
    /// The idiomatic way to check if a reorder is necessary between the memory
    /// format expected a primitive (the convolution in our case) and the
    /// available memory format is to compare the corresponding memory
    /// descriptors.
    ///
    /// @snippet memory_format_propagation.cpp Determine if source needs to be reordered
    // [Determine if source needs to be reordered]
    bool need_reorder_src = conv_pd.src_desc() != src_mem.get_desc();
    // [Determine if source needs to be reordered]

    /// @page memory_format_propagation_cpp
    ///
    /// @warning It is by design that it is not possible to just compare
    /// memory tags. The reason behind this is that a memory format tags only
    /// provide a partial description of how data is laid out in memory and do
    /// not, for example, describe memory objects obtained via sub-memory
    /// constructor.
    ///
    /// We repeat the process for the weights and destination memory format
    /// descriptors as well.
    ///
    /// @snippet memory_format_propagation.cpp Determine if weights and destination need to be reordered
    // [Determine if weights and destination need to be reordered]
    bool need_reorder_weights
            = conv_pd.weights_desc() != weights_mem.get_desc();
    bool need_reorder_dst = conv_pd.dst_desc() != dst_mem.get_desc();
    // [Determine if weights and destination need to be reordered]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub45 Allocate intermediate buffers if necessary
    ///
    /// Based on the flags computed before, we can now decide if we need extra
    /// intermediate buffers to hold the source and weights data for the
    /// convolution and the output of the pooling.
    ///
    /// Memory objects for the intermediate buffers are created based on the
    /// memory descriptors obtained from the primitive descriptors to ensure
    /// consistency.
    ///
    /// @snippet memory_format_propagation.cpp Allocate intermediate buffers if necessary
    // [Allocate intermediate buffers if necessary]
    auto conv_src_mem
            = need_reorder_src ? memory(conv_pd.src_desc(), eng) : src_mem;
    auto conv_weights_mem = need_reorder_weights
            ? memory(conv_pd.weights_desc(), eng)
            : weights_mem;
    auto conv_dst_mem = memory(conv_pd.dst_desc(), eng);
    auto pool_dst_mem
            = need_reorder_dst ? memory(pool_pd.dst_desc(), eng) : dst_mem;
    // [Allocate intermediate buffers if necessary]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub5 Perform reorders for source data if necessary
    ///
    /// Now we get to the part where we actually start executing things. We
    /// check if reorders are necessary based on the flags computed before and
    /// create and execute them immediately.
    ///
    /// @note We call @ref dnnl::stream::wait() before reorder primitives
    /// get out of scope and destroyed to accommodate for potentially
    /// asynchronous execution.
    ///
    /// @snippet memory_format_propagation.cpp Perform reorders for source data if necessary
    // [Perform reorders for source data if necessary]
    if (need_reorder_src) {
        auto reorder_src = reorder(src_mem, conv_src_mem);
        reorder_src.execute(
                s, {{DNNL_ARG_FROM, src_mem}, {DNNL_ARG_TO, conv_src_mem}});
        s.wait(); // wait for the reorder to complete
    }

    if (need_reorder_weights) {
        auto reorder_weights = reorder(weights_mem, conv_weights_mem);
        reorder_weights.execute(s,
                {{DNNL_ARG_FROM, weights_mem},
                        {DNNL_ARG_TO, conv_weights_mem}});
        s.wait(); // wait for the reorder to complete
    }
    // [Perform reorders for source data if necessary]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub6 Create and execute convolution and pooling primitives
    ///
    /// After the reorders, we are now ready to compute convolution and
    /// pooling.
    ///
    /// @snippet memory_format_propagation.cpp Create and execute convolution and pooling primitives
    // [Create and execute convolution and pooling primitives]
    auto conv_scratchpad_mem = memory(conv_pd.scratchpad_desc(), eng);
    auto conv = convolution_forward(conv_pd);
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src_mem}, {DNNL_ARG_WEIGHTS, conv_weights_mem},
                    {DNNL_ARG_DST, conv_dst_mem}});
    auto pool_scratchpad_mem = memory(pool_pd.scratchpad_desc(), eng);
    auto pool = pooling_forward(pool_pd);
    pool.execute(
            s, {{DNNL_ARG_SRC, conv_dst_mem}, {DNNL_ARG_DST, pool_dst_mem}});
    s.wait();
    // [Create and execute convolution and pooling primitives]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub7 Reorder destination data if necessary
    ///
    /// The only potentially remaining operation is a reorder from the pooling
    /// destination memory object to the users's one.  Similarly to the
    /// reorders for the source and weights memory objects, it is performed
    /// depending on the value of the previously computed flag.
    ///
    /// @snippet memory_format_propagation.cpp Reorder destination data if necessary
    // [Reorder destination data if necessary]
    if (need_reorder_dst) {
        auto reorder_dst = reorder(pool_dst_mem, dst_mem);
        reorder_dst.execute(
                s, {{DNNL_ARG_FROM, pool_dst_mem}, {DNNL_ARG_TO, dst_mem}});
        s.wait();
    }
    // [Reorder destination data if necessary]
}

extern "C" int memory_format_propagation() {
    try {
        memory_format_propagation_tutorial(parse_engine_kind(1, NULL));  //CPU
    } catch (dnnl::error &e) {
        printf("Intel(R) DNNL: memory_format_propagation.cpp: failed!!!: status:%s\n", e.status);
        return 1;
    } catch (std::string &e) {
        printf("Intel(R) DNNL: memory_format_propagation.cpp: failed!!!\n");
        return 2;
    }

    printf("Intel(R) DNNL: memory_format_propagation.cpp: passes\n");
    return 0;
}

/// Upon compiling and run the example the output should be just:
///
/// ~~~sh
/// Example passes
/// ~~~
///
/// It may be interesting to check what really happens during the run. We can
/// use `DNNL_VERBOSE` environment variable for that (see also @ref
/// dev_guide_verbose). Here's example output on a system that has an Intel(R)
/// AVX2-capable processor (line breaks added for readability):
///
/// ~~~sh
/// $ DNNL_VERBOSE=1 ./memory_format_propagation
/// dnnl_verbose,info,DNNL <ver> (Git Hash <hash>),Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2)
/// dnnl_verbose,exec,reorder,jit:uni,undef,
///     src_f32::blocked:abcd:f0 dst_f32::blocked:aBcd8b:f0,num:1,1x256x14x14,1.03101
/// dnnl_verbose,exec,reorder,jit:uni,undef,
///     src_f32::blocked:abcd:f0 dst_f32::blocked:ABcd8b8a:f0,num:1,256x256x3x3,5.69678
/// dnnl_verbose,exec,convolution,jit:avx2,forward_inference,
///     src_f32::blocked:aBcd8b:f0 wei_f32::blocked:ABcd8b8a:f0 dst_f32::blocked:aBcd8b:f0,
///     alg:convolution_direct,mb1_ic256oc256_ih14oh14kh3sh1dh0ph1_iw14ow14kw3sw1dw0pw1,1.65698
/// dnnl_verbose,exec,pooling,jit:avx,forward_inference,
///     src_f32::blocked:aBcd8b:f0 dst_f32::blocked:aBcd8b:f0,
///     alg:pooling_max,mb1ic256_ih14oh14kh3sh1ph1_iw14ow14kw3sw1pw1,0.322021
/// dnnl_verbose,exec,reorder,jit:uni,
///     undef,src_f32::blocked:aBcd8b:f0 dst_f32::blocked:abcd:f0,num:1,1x256x14x14,0.333008
/// Example passes
/// ~~~
///
/// From this output we can deduce that:
/// * The convolution primitive picked up @ref
///   dnnl::memory::format_tag::aBcd8b optimized memory format for
///   activations. In this format the channels dimension (denoted by letter B
///   since it is the second dimension; see also @ref dev_guide_conventions)
///   is blocked by a factor of 8. Because of this memory format is different
///   from the NCHW format the tutorial uses, the source and destination had
///   to be reordered to and from this optimized memory layout.
/// * The convolution primitive picked up @ref
///   dnnl::memory::format_tag::ABcd8b8a optimized memory format (output (A)
///   and input (B) channel dimensions blocked by 8) which we also had to
///   reorder the initial weights to since they are in the OIHW memory format.
///
/// @page memory_format_propagation_cpp
